import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve,auc
from sklearn.tree import export_graphviz
from matplotlib.colors import ListedColormap
from IPython.display import Image
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD
# Load the preprocessed lending dataset; column 0 holds the saved row index.
dataset = pd.read_csv('lendingdata_preprocessed', index_col=[0])
dataset
# Separate the target ('status') from the feature matrix.
y = dataset.status.values
X = dataset.drop(['status'], axis=1)
# Hold out 20% of the rows for testing; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Standardize features: fit the scaler on the training split only, then apply
# the same transform to the test split (avoids train/test leakage).
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# Project both splits onto the first 200 principal components.
pca = PCA(n_components=200)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)
explained_variance = pca.explained_variance_ratio_
# Scree plot: cumulative share of variance captured as components are added.
plt.figure()
plt.plot(np.cumsum(explained_variance))
plt.xlabel('Number of Components')
plt.ylabel('Variance (%)')  # cumulative variance ratio per component count
plt.title('Dataset Explained Variance')
plt.show()
# Recursive feature elimination with cross-validation: find the feature subset
# that maximizes 10-fold stratified CV accuracy for a logistic-regression
# base estimator, then reduce both splits to that subset.
lr = LogisticRegression(random_state=0, max_iter=4000)
selector = RFECV(estimator=lr, step=1, cv=StratifiedKFold(10), scoring='accuracy')
selector.fit(X_train, y_train)
print("Optimal number of features : %d" % selector.n_features_)
# Plot mean CV score vs. number of features kept.
# FIX: RFECV.grid_scores_ was removed in scikit-learn 1.2; cv_results_
# (available since 1.0) exposes the same per-step mean test scores.
mean_scores = selector.cv_results_['mean_test_score']
plt.figure(figsize=(15, 8))
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score")
plt.plot(range(1, len(mean_scores) + 1), mean_scores)
plt.show()
selected = X.columns[selector.get_support()]
print(selected)
# Keep only the selected columns in both splits.
X_train_opt = selector.transform(X_train)
X_test_opt = selector.transform(X_test)
# Tune a logistic regression over the RFECV-selected features.
# FIX: the original grid used class_weight='dict', which is not a valid value
# (scikit-learn accepts 'balanced', None, or an actual mapping), so those
# candidates failed during fit. The duplicate C=5 entry is also removed.
lr = LogisticRegression(random_state=0)
parameters = [{'penalty': ['l2'], 'solver': ['newton-cg', 'saga'],
               'C': [0.001, 0.01, 1, 5, 10, 20],
               'class_weight': ['balanced', None]},
              {'penalty': ['l1'], 'solver': ['liblinear', 'saga'],
               'C': [0.001, 0.01, 1, 5, 10, 20]}]
grid_search = GridSearchCV(estimator = lr,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = -1)
grid_search = grid_search.fit(X_train_opt, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print("Best Accuracy "+str(best_accuracy))
print("Best Parameters "+str(best_parameters))
# Refit the final model with a valid configuration: class_weight=None replaces
# the invalid 'dict', and C=20 (the largest value actually searched) replaces
# the out-of-grid C=25.
lr = LogisticRegression(penalty='l2', C=20, random_state=0,
                        class_weight=None, solver='newton-cg')
lr.fit(X_train_opt, y_train)
y_pred = lr.predict(X_test_opt)
train = round((lr.score(X_train_opt, y_train)) * 100, 1)
test = round((lr.score(X_test_opt, y_test)) * 100, 1)
print("training accuracy " + str(train))
print("testing accuracy " + str(test))
# Persist the selected-feature dataset (scaled values + labels) to CSV.
# FIX: the original gave the train and test frames overlapping 0..n-1 row
# indices, so the final axis=1 concat operated on duplicate labels (an error
# on modern pandas). Test rows are now offset to follow the train rows so
# every row label is unique.
n_train = X_train_opt.shape[0]
n_test = X_test_opt.shape[0]
ff = pd.DataFrame(data=X_train_opt, index=range(n_train), columns=selected)
f = pd.DataFrame(data=X_test_opt, index=range(n_train, n_train + n_test),
                 columns=selected)
gg = pd.DataFrame(data=y_train, index=range(n_train), columns=['status'])
g = pd.DataFrame(data=y_test, index=range(n_train, n_train + n_test),
                 columns=['status'])
df1 = pd.concat([ff, f], axis=0)   # features: train rows then test rows
df2 = pd.concat([gg, g], axis=0)   # labels, in the same row order
df = pd.concat([df1, df2], axis=1)
df.to_csv('loan_data_opt.csv')
# Evaluate the tuned logistic regression: 10-fold CV accuracy on the training
# split, plus confusion matrix and per-class report on the held-out test split.
accuracies = cross_val_score(estimator=lr, X=X_train_opt, y=y_train, cv=10)
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)
report = classification_report(y_test, y_pred)
print(report)
print(f"Cross Validation Score {accuracies.mean() * 100}")
# Visualize the logistic-regression decision regions in 2-D: embed the reduced
# TEST set with t-SNE, then approximate the boundary by fitting a 1-NN on
# (embedding, predicted label) and colouring a background grid with it.
embedded = TSNE(n_components=2).fit_transform(X_test_opt)
print(embedded.shape)
# Build a resolution x resolution grid spanning the embedded points.
resolution = 100  # 100x100 background pixels
x_min, x_max = embedded[:, 0].min(), embedded[:, 0].max()
y_min, y_max = embedded[:, 1].min(), embedded[:, 1].max()
xx, yy = np.meshgrid(np.linspace(x_min, x_max, resolution),
                     np.linspace(y_min, y_max, resolution))
# 1-NN over the embedded points gives an approximate Voronoi tessellation
# of the model's predicted classes.
background_model = KNeighborsClassifier(n_neighbors=1).fit(embedded, y_pred)
voronoi = background_model.predict(np.c_[xx.ravel(), yy.ravel()])
voronoi = voronoi.reshape((resolution, resolution))
# Plot: class regions as filled contours, true labels as coloured points.
plt.contourf(xx, yy, voronoi)
plt.scatter(embedded[:, 0], embedded[:, 1], c=y_test)
plt.show()
import joblib

# Serialize the fitted logistic regression, then reload it from disk and
# confirm the round-tripped copy reproduces the test accuracy.
filename = 'logistc_model.sav'
joblib.dump(lr, filename)
loaded_model = joblib.load(filename)
result = loaded_model.score(X_test_opt, y_test)
print(result)
# --- K-nearest neighbours on the full scaled features and on the PCA space ---
# FIX: the original overwrote y_pred with the PCA-space predictions, so BOTH
# confusion matrices / classification reports described the PCA model. Each
# variant now keeps its own predictions and is evaluated against them.
knn = KNeighborsClassifier(metric='minkowski', n_neighbors=20, p=2, weights='uniform')
knn.fit(X_train, y_train)
train = round((knn.score(X_train, y_train)) * 100, 1)
test = round((knn.score(X_test, y_test)) * 100, 1)
y_pred_full = knn.predict(X_test)   # predictions of the full-feature model
print("training accuracy " + str(train))
print("testing accuracy " + str(test))
knn = KNeighborsClassifier(metric='minkowski', n_neighbors=20, p=2, weights='uniform')
knn.fit(X_train_pca, y_train)
train = round((knn.score(X_train_pca, y_train)) * 100, 1)
test = round((knn.score(X_test_pca, y_test)) * 100, 1)
y_pred = knn.predict(X_test_pca)    # predictions of the PCA-space model
print("training accuracy " + str(train))
print("testing accuracy " + str(test))
# Evaluation of the full-feature variant.
accuracies = cross_val_score(estimator=knn, X=X_train, y=y_train, cv=10)
cm = confusion_matrix(y_test, y_pred_full)
print(cm)
cr = classification_report(y_test, y_pred_full)
print(cr)
print("Cross Validation Score " + str(accuracies.mean() * 100))
# Evaluation of the PCA-space variant.
accuracies = cross_val_score(estimator=knn, X=X_train_pca, y=y_train, cv=10)
cm = confusion_matrix(y_test, y_pred)
print(cm)
cr = classification_report(y_test, y_pred)
print(cr)
print("Cross Validation Score " + str(accuracies.mean() * 100))
import joblib

# Persist the PCA-space KNN model, then reload it from disk and confirm the
# round-tripped copy reproduces the test accuracy.
filename = 'KNN_model.sav'
joblib.dump(knn, filename)
load_model = joblib.load(filename)
result = load_model.score(X_test_pca, y_test)
print(result)
# Fit a large random forest and rank the features by impurity-based importance.
column_labels = X.columns.values
rfc = RandomForestClassifier(random_state=0, n_estimators=1000)
rfc.fit(X_train, y_train)
f_importances = pd.Series(rfc.feature_importances_, X.columns)
f_importances.sort_values(ascending=False, inplace=True)
f_importances
# Keep only the features whose importance clears the 0.035 threshold.
sfm = SelectFromModel(rfc, threshold=0.035)
sfm.fit(X_train, y_train)
print("Important Features")
a = []  # names of the retained features, in column order
for idx in sfm.get_support(indices=True):
    print("Feature " + str(idx) + " : " + column_labels[idx])
    a.append(column_labels[idx])
# Reduce both splits to the important columns.
X_train_imp = sfm.transform(X_train)
X_test_imp = sfm.transform(X_test)
# Persist the importance-selected dataset (scaled values + labels) to CSV.
# FIXES: (1) the row counts for the index were taken from X_train_opt /
# X_test_opt instead of the arrays actually being framed; (2) the train and
# test frames shared 0..n-1 indices, so the final axis=1 concat operated on
# duplicate labels (an error on modern pandas). Test rows are now offset.
n_train = X_train_imp.shape[0]
n_test = X_test_imp.shape[0]
ff = pd.DataFrame(data=X_train_imp, index=range(n_train), columns=a)
f = pd.DataFrame(data=X_test_imp, index=range(n_train, n_train + n_test),
                 columns=a)
gg = pd.DataFrame(data=y_train, index=range(n_train), columns=['status'])
g = pd.DataFrame(data=y_test, index=range(n_train, n_train + n_test),
                 columns=['status'])
df1 = pd.concat([ff, f], axis=0)   # features: train rows then test rows
df2 = pd.concat([gg, g], axis=0)   # labels, in the same row order
df = pd.concat([df1, df2], axis=1)
df.to_csv('loan_data_imp.csv')
# --- Random forest on the full feature set and on the importance subset ---
# FIX: the original overwrote y_pred with the reduced-feature predictions, so
# BOTH confusion matrices / reports described the reduced model. Each variant
# now keeps its own predictions and is evaluated against them. (Also fixes the
# missing space in one "testing accuracy" message.)
rfc = RandomForestClassifier(n_estimators=1000, random_state=0, n_jobs=-1)
rfc.fit(X_train, y_train)
train = round((rfc.score(X_train, y_train)) * 100, 1)
test = round((rfc.score(X_test, y_test)) * 100, 1)
y_pred_full = rfc.predict(X_test)   # predictions of the full-feature forest
print("training accuracy " + str(train))
print("testing accuracy " + str(test))
rfc = RandomForestClassifier(n_estimators=1000, random_state=0, n_jobs=-1)
rfc.fit(X_train_imp, y_train)
train = round((rfc.score(X_train_imp, y_train)) * 100, 1)
test = round((rfc.score(X_test_imp, y_test)) * 100, 1)
y_pred = rfc.predict(X_test_imp)    # predictions of the reduced-feature forest
print("training accuracy " + str(train))
print("testing accuracy " + str(test))
# Evaluation of the full-feature variant.
accuracies = cross_val_score(estimator=rfc, X=X_train, y=y_train, cv=10)
cm = confusion_matrix(y_test, y_pred_full)
print(cm)
cr = classification_report(y_test, y_pred_full)
print(cr)
print("Cross Validation Score " + str(accuracies.mean() * 100))
# Evaluation of the importance-selected variant.
accuracies = cross_val_score(estimator=rfc, X=X_train_imp, y=y_train, cv=10)
cm = confusion_matrix(y_test, y_pred)
print(cm)
cr = classification_report(y_test, y_pred)
print(cr)
print("Cross Validation Score " + str(accuracies.mean() * 100))
def _render_tree(tree, stem):
    # Export one decision tree to <stem>.dot and rasterize it to <stem>.png
    # via the external Graphviz `dot` command.
    export_graphviz(tree, out_file=stem + '.dot',
                    feature_names=a,
                    class_names=['funded', 'not_funded'],
                    rounded=True, proportion=False,
                    precision=2, filled=True)
    os.system('dot -Tpng ' + stem + '.dot -o ' + stem + '.png')

# Render one tree (index 20) from the reduced-feature forest.
estimator = rfc.estimators_[20]
_render_tree(estimator, 'tree')
Image(filename='tree.png')
# A depth-limited forest yields a readable tree for presentation; render
# its tree at index 5.
model = RandomForestClassifier(max_depth=3, n_estimators=1000, random_state=0, n_jobs=-1)
model.fit(X_train_imp, y_train)
estimator_limited = model.estimators_[5]
estimator_limited
_render_tree(estimator_limited, 'tree_limited')
Image(filename='tree_limited.png')
# Persist the reduced-feature forest and confirm it round-trips through disk.
filename = 'RFC_model.sav'
joblib.dump(rfc, filename)
loaded_model = joblib.load(filename)
result = loaded_model.score(X_test_imp, y_test)
print(result)
# RBF-kernel SVM on the PCA-reduced features. probability=True enables
# predict_proba, which the ROC comparison later in the script relies on.
svm = SVC(kernel='rbf', random_state=0, probability=True, gamma='auto')
svm.fit(X_train_pca, y_train)
y_pred = svm.predict(X_test_pca)
train = round(svm.score(X_train_pca, y_train) * 100, 1)
test = round(svm.score(X_test_pca, y_test) * 100, 1)
print("training accuracy " + str(train))
print("testing accuracy" + str(test))
# Persist the SVM and confirm it round-trips through disk.
filename = 'SVM_model.sav'
joblib.dump(svm, filename)
loaded_model = joblib.load(filename)
result = loaded_model.score(X_test_pca, y_test)
print(result)
# Grid-search a decision tree over split criterion, splitter strategy and depth,
# using the RFECV-selected features and 10-fold CV accuracy.
dc = DecisionTreeClassifier(random_state=0)
parameters = [{
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [4, 6, 8, 12],
}]
grid_search = GridSearchCV(estimator=dc, param_grid=parameters,
                           scoring='accuracy', cv=10, n_jobs=-1)
grid_search = grid_search.fit(X_train_opt, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print("Best Accuracy " + str(best_accuracy))
print("Best Parameters " + str(best_parameters))
# --- Decision tree on the full feature set and on the importance subset ---
# FIX: the original computed BOTH confusion matrices / reports from dtc's
# reduced-feature predictions; the full-feature model (dt) is now evaluated
# against its own predictions. (Also fixes the missing space in two
# "testing accuracy" messages.)
dt = DecisionTreeClassifier(criterion='entropy', random_state=0, max_depth=8)
dt.fit(X_train, y_train)
y_pred_full = dt.predict(X_test)    # predictions of the full-feature tree
train = round((dt.score(X_train, y_train)) * 100, 1)
test = round((dt.score(X_test, y_test)) * 100, 1)
print("training accuracy " + str(train))
print("testing accuracy " + str(test))
dtc = DecisionTreeClassifier(criterion='entropy', random_state=0, max_depth=8)
dtc.fit(X_train_imp, y_train)
y_pred = dtc.predict(X_test_imp)    # predictions of the reduced-feature tree
train = round((dtc.score(X_train_imp, y_train)) * 100, 1)
test = round((dtc.score(X_test_imp, y_test)) * 100, 1)
print("training accuracy " + str(train))
print("testing accuracy " + str(test))
# Evaluation of the full-feature variant.
accuracies = cross_val_score(estimator=dt, X=X_train, y=y_train, cv=10)
cm = confusion_matrix(y_test, y_pred_full)
print(cm)
cr = classification_report(y_test, y_pred_full)
print(cr)
print("Cross Validation Score " + str(accuracies.mean() * 100))
# Evaluation of the importance-selected variant.
accuracies = cross_val_score(estimator=dtc, X=X_train_imp, y=y_train, cv=10)
cm = confusion_matrix(y_test, y_pred)
print(cm)
cr = classification_report(y_test, y_pred)
print(cr)
print("Cross Validation Score " + str(accuracies.mean() * 100))
# Persist the reduced-feature tree and confirm it round-trips through disk.
import joblib
filename = 'DT_model.sav'
joblib.dump(dtc, filename)
loaded_model = joblib.load(filename)
result = loaded_model.score(X_test_imp, y_test)
print(result)
# Overlay ROC curves (positive-class probabilities) for all five models, each
# scored on the feature representation it was trained on, plus the chance line.
plt.figure(figsize=(20, 12))
plt.plot([0, 1], [0, 1], 'r--')  # diagonal = random-guess baseline
roc_models = [
    (lr, X_test_opt, 'Logistic Regression Classifier', 'g'),
    (knn, X_test_pca, 'KNN Classifier', 'c'),
    (rfc, X_test_imp, 'Random Forest Classifier', 'm'),
    (svm, X_test_pca, 'SVM Classifier', 'y'),
    (dtc, X_test_imp, 'Decision Tree Classifier', 'b'),
]
for model, features, name, colour in roc_models:
    probs = model.predict_proba(features)[:, 1]  # P(class = 1)
    fpr, tpr, thresholds = roc_curve(y_test, probs)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, c=colour,
             label=name + ' AUC ' + '{0:.2f}'.format(roc_auc), linewidth=3)
plt.xlabel('False Positive Rate', fontsize=16)
plt.ylabel('True Positive Rate', fontsize=16)
plt.title('Receiver Operating Characteristic', fontsize=16)
plt.legend(loc='lower right', fontsize=16)
# --- Cumulative Accuracy Profile (CAP) for the random forest ---
# FIXES: (1) the original sorted by y_pred, which by this point held the
# DECISION TREE's predictions while the curve was labeled 'Random Forest
# classifier' — the forest's own predictions are now used to match the label;
# (2) the area variable was named `a`, clobbering the important-feature list.
total = len(y_test)             # number of test observations
one_count = np.sum(y_test)      # count of '1' labels (funded) in the test data
zero_count = total - one_count  # count of '0' labels in the test data
plt.figure(figsize=(15, 8))
# Random model: positives found grow linearly with observations inspected.
plt.plot([0, total], [0, one_count], c='r', linestyle='--', label='Random Model')
plt.legend()
# Rank test rows by the forest's predicted class (positives first), then
# accumulate the true labels in that order.
cap_pred = rfc.predict(X_test_imp)
lm = [y for _, y in sorted(zip(cap_pred, y_test), reverse=True)]
p = np.arange(0, total + 1)        # x values: observations inspected
q = np.append([0], np.cumsum(lm))  # y values: positives captured so far
plt.plot(p, q, c='b', label='Random Forest classifier', linewidth=2)
plt.legend()
# Perfect model: captures every positive within the first one_count rows.
plt.plot([0, one_count, total], [0, one_count, one_count], c='grey',
         linewidth=2, label='Perfect Model')
plt.legend()
plt.xlabel("Total Observations", fontsize=16)
plt.ylabel("Funded Observations", fontsize=16)
plt.title("Cumulative Accuracy Profile", fontsize=16)
# Accuracy ratio: area between the model and random curves, normalized by the
# area between the perfect and random curves.
area_random = auc([0, total], [0, one_count])
area_perfect = auc([0, one_count, total], [0, one_count, one_count]) - area_random
area_model = auc(p, q) - area_random
print("Accuracy Rate : ", area_model / area_perfect)